# -*- coding: utf-8 -*-
"""
Created on Tue Mar 24 11:38:39 2020

@author: Administrator
"""

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 24 10:30:16 2020

@author: Administrator
"""

# Announce on stdout that the scrape is starting (runs before the imports below).
print('开始采集')
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import random
import time
import datetime

# Date stamp (YYYY-MM-DD) attached to every scraped row.
now_time = datetime.datetime.now().strftime('%Y-%m-%d')
# Collected rows: each entry is a list of ten values (the eight scraped
# fields, the date stamp and the source-site label).
list1=[]

# (compiled pattern, capture group to keep) for the eight fields of one price
# row, in output-column order.  Compiled once here instead of on every row.
_field_patterns = [
    (re.compile(r'<a href="(.*?)" target="_blank" title="(.*?)">(.*?)</a></span>'), 2),
    (re.compile(r'<span class="w2"><a href="(.*?)" target="_blank" title="(.*?)">(.*?)</a></span>'), 2),
    (re.compile(r'<span class="w9" title="(.*?)">(.*?)</span>'), 2),
    (re.compile(r'<span class="w3">(.*?)</span>'), 1),
    (re.compile(r'<span class="w4">(.*?)</span>'), 1),
    (re.compile(r'<span class="w5"><em class="(.*?)">(.*?)</em></span>'), 2),
    (re.compile(r'<span class="w6"><em class="(.*?)">(.*?)</em></span>'), 2),
    (re.compile(r'<span class="w7"><em class="(.*?)">(.*?)</em></span>'), 2),
]
# Maximum number of row fragments read from a single page.
_rows_per_page = 25

for j in range(136):
    # Random 1-3 second pause.  It used to be computed but never applied, so
    # requests were sent back to back; it is now slept before every page
    # after the first.
    stop = random.uniform(1, 3)
    if j == 0:
        url = "https://www.zyctd.com/jiage/1-0-0.html"
    else:
        url = "https://www.zyctd.com/jiage/1-0-0-"+str(j+1)+".html"
        time.sleep(stop)
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36")
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as resp:
        html = resp.read()
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
    # is installed and emits a warning.  Left unchanged because the regexes
    # above depend on the serialised HTML; confirm before pinning a parser.
    soup = BeautifulSoup(html)
    soup2 = soup.findAll("ul",class_= "priceTableRows")
    # Splitting on the first-column marker yields one fragment per row;
    # element 0 is whatever precedes the first marker and is skipped.
    soup3 = str(soup2).split('<span class="w1">')

    print('已经爬取天地药材网页数',j+1)
    # Slice instead of indexing 1..25 so a page with fewer rows no longer
    # raises IndexError and throws away everything scraped so far.
    fragments = soup3[1:_rows_per_page + 1]
    if not fragments:
        print('第',j+1,'页未找到价格行')
    for i, fragment in enumerate(fragments, start=1):
        matches = [(pattern.search(fragment), group)
                   for pattern, group in _field_patterns]
        # A fragment missing any field used to crash with AttributeError on
        # `.group`; report it and move on to the next row instead.
        if any(match is None for match, _ in matches):
            print('第',j+1,'页第',i,'行解析失败,已跳过')
            continue
        list2 = [match.group(group) for match, group in matches]
        list2.append(now_time)
        list2.append('天地药材网')
        list1.append(list2)

    
import openpyxl
# Workbook that accumulates the scraped rows; loaded from and saved back to
# the same path.
_workbook_path = 'E:/数据/药材网/日/天药合并表.xlsx'
data = openpyxl.load_workbook(_workbook_path)
# Diagnostic output.  `defined_names` and `sheetnames` replace the deprecated
# get_named_ranges() / get_sheet_names() accessors.
print(data.defined_names) # named ranges defined in the workbook
print(data.sheetnames) # names of all worksheets
sheetnames = data.sheetnames
# Rows are written to the active worksheet.  The original also looked up the
# first sheet by name, but that result was immediately overwritten by
# `data.active`, so the dead lookup has been dropped.
table = data.active
print(table.title) # title of the worksheet being written
nrows = table.max_row # current number of rows
ncolumns = table.max_column # current number of columns
values = list1
# Append each scraped row below the existing data, one cell per field
# (openpyxl rows and columns are 1-based).
for i, row_values in enumerate(values):
    for j, cell_value in enumerate(row_values):
        table.cell(nrows+i+1,j+1).value = cell_value
data.save(_workbook_path)
print("今天完成一次数据采集!")
# Timestamped completion message.
subject = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + " 采集完成"
print(subject)